Supplementary Figure 1

Reading post integrated data

Supplementary Figure 1 B | QC of cells from each donor and subset, nCount_RNA, nFeature_RNA and percent mitochondrial content is shown.

Tfh.Map@meta.data$celltype <- factor(Tfh.Map@meta.data$celltype, levels = c("Tfh1", "Tfh2",  "Tfh17", "PD1n"))
Tfh.Map@meta.data$donor_id <- factor(Tfh.Map@meta.data$donor_id, levels = c("donor0", "donor1", "donor2", "donor3"))
viol_cols_qc <- c("donor0"="#FFFFFF", "donor1"="#8F9A98", "donor2"="#DADEDD", "donor3"="#4A5251")

##Creating each plot
a <- VlnPlot(Tfh.Map, features = c("nCount_RNA"),
            cols = viol_cols_qc, group.by = "celltype", split.by = "donor_id",
            pt.size = -1,  flip = FALSE) 
## The default behaviour of split.by has changed.
## Separate violin plots are now plotted side-by-side.
## To restore the old behaviour of a single split violin,
## set split.plot = TRUE.
##       
## This message will be shown once per session.
# Extract max y value from the plot
max_a <- ceiling(max(ggplot_build(a)$data[[1]]$y, na.rm = TRUE))

# Modify the plot to only show the max y tick
af <-a + scale_y_continuous(breaks = max_a, labels = max_a) +
  theme(axis.ticks.y = element_blank(),
        axis.text.y = element_text(size = 10))
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
b <- VlnPlot(Tfh.Map, features = c("nFeature_RNA"),
             cols = viol_cols_qc, group.by = "celltype", split.by = "donor_id",
             pt.size = -1,  flip = FALSE) 
# Extract max y value from the plot
max_b <- ceiling(max(ggplot_build(b)$data[[1]]$y, na.rm = TRUE))

# Modify the plot to only show the max y tick
bf <- b + scale_y_continuous(breaks = max_b, labels = max_b) +
  theme(axis.ticks.y = element_blank(),
        axis.text.y = element_text(size = 10))
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
c <- VlnPlot(Tfh.Map, features = c("percent.mt"),
             cols = viol_cols_qc, group.by = "celltype", split.by = "donor_id",
             pt.size = -1,  flip = FALSE) 
# Extract max y value from the plot
y_vals <- ggplot_build(c)$data[[1]]$y
max_y <-  ceiling(max(y_vals, na.rm = TRUE))

# Modify the plot to only show the max y tick
cf <- c + scale_y_continuous(breaks = 10.21729, labels = max_y) +
  theme(axis.ticks.y = element_blank(),
        axis.text.y = element_text(size = 10))
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
final <- af/bf/cf

print(final)

Reading raw data

Input can be found in zenodo https://zenodo.org/records/14847353 Raw data have been deposited in the NCBI data base under accession number GSE272939 https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE272939 and accession number GSE253661 https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE253661

# We use the for loop to execute two commands for each sample - (1) read in the count data (Read10X()) and (2) create the Seurat objects from the read in data (CreateSeuratObject()):

##Formating input
# Set working directory (customize this as needed)

# Get list of .gz files
file.names <- list.files(pattern = '\\.gz$', recursive = TRUE)

# Function to extract the part between first pair of underscores
extract_key <- function(filename) {
  matches <- regmatches(filename, regexpr("_(.*?)_", filename, perl = TRUE))
  key <- gsub("_", "", matches)  # remove underscores
  return(key)
}

# Extract keys
keys <- sapply(file.names, extract_key)

# Unique folder names based on extracted keys
unique.keys <- unique(keys)

# Create folders if they don't exist
sapply(unique.keys, function(key) {
  if (!dir.exists(key)) dir.create(key)
})

# Copy files into their corresponding folders
for (i in seq_along(file.names)) {
  original_path <- file.path(getwd(), file.names[i])
  key <- keys[i]
  
  # Extract new file name by removing prefix (up to second underscore)
  new_name <- sub("^[^_]+_[^_]+_", "", basename(file.names[i]))
  
  dest_path <- file.path(getwd(), key, new_name)
  file.copy(original_path, dest_path, overwrite = TRUE)
}

# Create Seurat objects for each folder
seurat_objects <- list()
for (key in unique.keys) {
  folder_path <- file.path(getwd(), key)
  seurat_data <- Read10X(data.dir = folder_path)
  seurat_obj <- CreateSeuratObject(counts = seurat_data,
                                   min.cells = 3,
                                   min.features = 200,
                                   project = key)
  seurat_objects[[key]] <- seurat_obj
  assign(key, seurat_obj)
}

##Changing suffix to make the cells uniques
seurat_objects$PD1Neg@meta.data$cellname <- colnames(seurat_objects$PD1Neg)
seurat_objects$Tfh1@meta.data$cellname <- colnames(seurat_objects$Tfh1)
seurat_objects$Tfh2@meta.data$cellname <- colnames(seurat_objects$Tfh2)
seurat_objects$Tfh17@meta.data$cellname <- colnames(seurat_objects$Tfh17)

seurat_objects$PD1Neg@meta.data$cellname <- gsub("*-1", "-1", seurat_objects$PD1Neg@meta.data$cellname)
seurat_objects$Tfh1@meta.data$cellname  <- gsub("*-1", "-2", seurat_objects$Tfh1@meta.data$cellname)
seurat_objects$Tfh2@meta.data$cellname  <- gsub("*-1", "-3", seurat_objects$Tfh2@meta.data$cellname)
seurat_objects$Tfh17@meta.data$cellname <- gsub("*-1", "-4", seurat_objects$Tfh17@meta.data$cellname)

##adding information as character
vector1 <- seurat_objects$PD1Neg@meta.data$cellname
vector2 <- seurat_objects$Tfh1@meta.data$cellname 
vector3 <- seurat_objects$Tfh2@meta.data$cellname
vector4 <- seurat_objects$Tfh17@meta.data$cellname

##Renaming cells in the object
seurat_objects$PD1Neg <- RenameCells(seurat_objects$PD1Neg, new.names = vector1)
seurat_objects$Tfh1 <- RenameCells(seurat_objects$Tfh1, new.names = vector2)
seurat_objects$Tfh2 <- RenameCells(seurat_objects$Tfh2, new.names = vector3)
seurat_objects$Tfh17 <- RenameCells(seurat_objects$Tfh17, new.names = vector4)

##Renaming rows in metadata
rownames(seurat_objects$PD1Neg@meta.data) <- seurat_objects$PD1Neg@meta.data$cellname
rownames(seurat_objects$Tfh1@meta.data) <- seurat_objects$Tfh1@meta.data$cellname
rownames(seurat_objects$Tfh2@meta.data) <- seurat_objects$Tfh2@meta.data$cellname
rownames(seurat_objects$Tfh17@meta.data) <- seurat_objects$Tfh17@meta.data$cellname

merged_seurat <- merge(x = seurat_objects$PD1Neg,
                       y = c(seurat_objects$Tfh1,seurat_objects$Tfh2,seurat_objects$Tfh17))

##reading donor information

donor.ids <- read.csv("/working_groups/boylelab/shared/ZulyPava/Tfh_diversity_GEX_4.4/input/donor_ids.csv") 
colnames(donor.ids)
##Changing the colname in donor_ids
donor_ids <- donor.ids %>%
  plyr::rename(c(
    "cell" = "cellname",
    "donor_id" = "vireo_donor_id",
    "prob_max" = "vireo_prob_max",
    "prob_doublet" = "vireo_prob_doublet",
    "n_vars" = "vireo_n_vars",
    "best_singlet" = "vireo_best_singlet",
    "best_doublet" = "vireo_best_doublet"))
head(donor_ids)
#91347     7
#when you join the donor_id information into the Meta.data it overwrites the rownames in the meta.data and colnames in the object.  These are needed to map!!

##storing cellname information (i.e. object's colname and metadata rownames)
merged_seurat@meta.data$cellname <- rownames(merged_seurat@meta.data)

vectorrux <- colnames(merged_seurat)

##adding donor information
merged_seurat@meta.data <- plyr::join(merged_seurat@meta.data, donor_ids, by = "cellname")
##checking the information was added
head(merged_seurat@meta.data)

##add the rownames back to the metadata
rownames(merged_seurat@meta.data) <- merged_seurat@meta.data$cellname

##add the colnames back to object columns
merged_seurat <- RenameCells(merged_seurat, new.names = vectorrux)

#checking
tail(colnames(merged_seurat))
tail(rownames(merged_seurat@meta.data))
table(merged_seurat@meta.data$vireo_donor_id, useNA = "ifany")
##rownames are back

#Calculate per mito
# The [[ operator can add columns to object metadata. This is a great place to stash QC stats
merged_seurat[["percent.mt"]] <- PercentageFeatureSet(merged_seurat, pattern = "^MT-")

merged_seurat$celltype_donor<- paste0(merged_seurat$orig.ident, "_", merged_seurat$vireo_donor_id)
##save RDS file
saveRDS(merged_seurat, file = "/working_groups/boylelab/shared/ZulyPava/Tfh_diversity_GEX_4.4/input/Thf_diversity_merged_raw.rds")
merged_seurat <- readRDS("/working_groups/boylelab/shared/ZulyPava/Tfh_diversity_GEX_4.4/input/Thf_diversity_merged_raw.rds")

####Supplementary Figure 1 C Dimplot pre integration
merged_seurat_s <- subset(merged_seurat, vireo_donor_id!= "doublet")

# Get gene names
gene_list <- rownames(merged_seurat_s)

# Find TCR gene indices using regex
tcr_genes <- grep("^TRA[VJD]|^TRB[VJD]", gene_list, value = TRUE)

# Check how many are found
cat("TCR genes found:", length(tcr_genes), "\n")
## TCR genes found: 96
#96 found

# Remove TCR genes from the object
merged_seurat_s_f <- subset(merged_seurat_s, features = setdiff(gene_list, tcr_genes))##all rows in gene_list that arent in tcr_genes
Running standard workflow
##Log Norm
merged_seurat_s_f <- NormalizeData(merged_seurat_s_f)
## Normalizing layer: counts.PD1Neg
## Normalizing layer: counts.Tfh1
## Normalizing layer: counts.Tfh2
## Normalizing layer: counts.Tfh17
merged_seurat_s_f <- ScaleData(merged_seurat_s_f)
## Centering and scaling data matrix
merged_seurat_s_f <- FindVariableFeatures(merged_seurat_s_f)
## Finding variable features for layer counts.PD1Neg
## Finding variable features for layer counts.Tfh1
## Finding variable features for layer counts.Tfh2
## Finding variable features for layer counts.Tfh17
merged_seurat_s_f <- RunPCA(merged_seurat_s_f, npcs = 30, verbose = FALSE)
ElbowPlot(merged_seurat_s_f, ndims = 30)## we'll go with 30.

merged_seurat_s_f <- RunUMAP(merged_seurat_s_f, reduction = "pca", dims = 1:30)
## 16:40:26 UMAP embedding parameters a = 0.9922 b = 1.112
## 16:40:26 Read 7373 rows and found 30 numeric columns
## 16:40:26 Using Annoy for neighbor search, n_neighbors = 30
## 16:40:26 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 16:40:27 Writing NN index file to temp file /tmp/RtmpfR1vt6/file2e056c497f917f
## 16:40:27 Searching Annoy index using 1 thread, search_k = 3000
## 16:40:29 Annoy recall = 100%
## 16:40:30 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 16:40:31 Initializing from normalized Laplacian + noise (using RSpectra)
## 16:40:31 Commencing optimization for 500 epochs, with 310870 positive edges
## 16:40:31 Using rng type: pcg
## 16:40:40 Optimization finished
merged_seurat_s_f <- FindNeighbors(merged_seurat_s_f, reduction = "pca", dims = 1:30)
## Computing nearest neighbor graph
## Computing SNN

Supplementary Figure 1 C top | UMAP of data before and after integration for donor.

donor_cols = c("donor0"="#89D9C9", "donor1" = "#78BBDA", "donor2"="#BDC1E0", "donor3"="#FF7C71")

Idents(object = merged_seurat_s_f) <- "vireo_donor_id"
top <- DimPlot(merged_seurat_s_f,  pt.size = 2, group.by = "vireo_donor_id", 
        cols = c("#89D9C9", "#78BBDA", "#BDC1E0", "#FF7C71")) 

Idents(object = Tfh.Map) <- "donor_id"


bottom <- DimPlot(Tfh.Map, label = FALSE, pt.size = 2, group.by = "donor_id", 
        cols = c("#89D9C9", "#78BBDA", "#BDC1E0", "#FF7C71" ))

print(top/bottom)

#### Supplementary Figure 1 D The data was downloaded from https://zenodo.org/records/11355186 publication is Massoni-Badosa R,.. et al. An atlas of cells in the human tonsil. Immunity. 2024 Feb 13;57(2):379-399.e18. doi: 10.1016/j.immuni.2024.01.006. Epub 2024 Jan 31. PMID: 38301653; PMCID: PMC10869140.

Getting data for plot 1
Getting data for plot 2

Supplementary Figure 1D top left | UMAP of the original annotation from the human tonsil atlas

topR <- DimPlot_scCustom(seurat_object = tonsil.tfh, group.by = "annotation_20230508", pt.size = 2, label = T,
                 colors_use = DiscretePalette_scCustomize(num_colors = 15, palette = "ditto_seq"))


topL <- DimPlot_scCustom(seurat_object = tonsil.tfh, group.by = "customclassif", pt.size = 2, label = T,colors_use = c("#1B685E","#AEB2D8","#C9812B", "#914F1F", "#464BA7"))  

metadata <- tonsil.tfh@meta.data

# Define custom colors
colors_use <- c("#fdbf6f", "#a6cee3", "#41ae76", "#fff7bc", "#2171b5", "#fc4e2a")

# Count cells within each customclassif and annotation_20230508 group
metadata_summary <- metadata %>%
  count(customclassif, annotation_20230508)  # Summarize counts

bottom <- ggplot(metadata_summary, aes(x = customclassif, y = n, fill = annotation_20230508)) +
  geom_bar(stat = "identity", position = "stack") +  # Stacked bar for counts
  labs(x = "Custom Classification", y = "Cell Count", fill = "Annotation") +
  scale_fill_manual(values = colors_use) + 
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))  # Rotate x-axis labels for clarity

print((topR +topL) /bottom)

Supplementary Figure 1 E | Expression of genes with various roles in Tfh and CD4 T cell development across different Tfh cell clusters.

human_colors_list <- c("#9e0142", "#f4a582", "#5e4fa2", "#66c2a5", "#e6f598", 
                       "#74add1", "#8c510a", "#bf812d", "#fee090", "#01665e", "#b2abd2")


Tfh.ref.markers <- c("EOMES", "RUNX3","NKG7",
                     "TGFB1", "TOX", "SH2D1A", "PDCD1",
                     "TOX2", "MAF", "TCF7", 
                     "LGALS3", "TNFRSF18", "BHLHE40", "CCR4", "ID2")

Stacked_VlnPlot(Tfh.Map, features = Tfh.ref.markers, x_lab_rotate = TRUE,
                colors_use = human_colors_list, group.by = "integratedcluster")

#### Supplementary Figure 1 F | Relationship between cell sorted identity and transitional cell cluster identify.

all_metadata <- data.frame(Tfh.Map@meta.data)
prop_table <- as.data.frame(table(all_metadata$celltype, all_metadata$integratedcluster))
colnames(prop_table)[1] <- "Sort_Cell_Identity"
colnames(prop_table)[2] <- "scRNAseq_clusters"

# Convert to factor and keep level order
# Rename the Sort_Cell_Identity labels by adding an underscore
prop_table$Sort_Cell_Identity <- gsub("^Tfh(\\d+)$", "Tfh_\\1", prop_table$Sort_Cell_Identity)

# Set the desired order with updated labels
prop_table$Sort_Cell_Identity <- factor(prop_table$Sort_Cell_Identity,
                                        levels = c("Tfh_1", "Tfh_2", "Tfh_17", "PD1n"))
prop_table$scRNAseq_clusters <- factor(prop_table$scRNAseq_clusters,
                                       levels = c("Tfh1", "Tfh2", "Tfh17", "Tfh1_CM", "Tfh1.17",
                                                  "Tfreg", "IFNI", "heatshock", "ribosomal", "cluster2", "cluster3"))

# Define named color vectors (order matters!) 
colors_forthis = c(
  "Tfh_1"  = "#EC6E6E",
  "Tfh_2"  = "#5EA5D1",
  "Tfh_17" = "#60B275",
  "PD1n"  = "#D0BFDF",
  "Tfh1"  = "#A62241",
  "Tfh2"  = "#464BA7",
  "Tfh17" = "#64CBA9",
  "Tfh1_CM" = "#FFA582",
  "Tfh1.17"   = "#E5EAA0",
  "Tfreg"     = "#61A1BE",
  "IFNI"      = "#914F1F",
  "heatshock" = "#C9812B",
  "ribosomal" = "#FFDF94",
  "cluster2"  = "#1B685E",
  "cluster3"  = "#AEB2D8")


ggplot(data = prop_table,
       aes(axis1 = Sort_Cell_Identity, axis2 = scRNAseq_clusters, y = Freq)) +
  scale_x_discrete(limits = c("Sort_Cell_Identity", "scRNAseq_clusters"), expand = c(.2, .05)) +
  geom_alluvium(aes(fill = Sort_Cell_Identity)) +
  geom_stratum(aes(stratum = after_stat(stratum), fill = after_stat(stratum)), width = 1/4) +
  geom_text(stat = "stratum", aes(label = after_stat(stratum)), size = 3) +
  scale_fill_manual(values = colors_forthis) +
  theme_minimal()

Supplementary Figure 1 G | Pathway analysis of top 25 pathways curated from cluster marker genes of Tfh1_CCR7 and Tfh1_cytotoxic clusters.

df <- read.csv("/working_groups/boylelab/shared/ZulyPava/Tfh_diversity_finalcode/input/TfhMap_ClusterMarker_Pathways_190624.csv", sep =',', header = TRUE)
### removed NaN and 0 Z-score
df$direction <- ifelse(df$z.score > 0, "upregulated", "downregulated")

Tfh_colors <- c("#9e0142", "#f4a582", "#5e4fa2", "#66c2a5", "#bf812d")

### plot top 25 upregulated for Tfh1 + Tfh1CM + Tfh2 + Tfh17 
df.1 <- df[df$CellType == "Tfh1",]
df.1 <- df.1[1:25,]

df.2 <- df[df$CellType == "Tfh1_CM",]
df.2 <- df.2[1:25,]

df.3 <- df[df$CellType == "Tfh2",]
df.3 <- df.3[1:25,]

df.4<- df[df$CellType == "Tfh17",]
df.4 <- df.4[1:25,]

#merge
C2 <- rbind(df.1, df.2, df.3, df.4)

## getting just Tfh1 and Tfh1CM (16 Feb 2025)
C2sub <- C2[C2$CellType %in% c("Tfh1", "Tfh1_CM"), ]

C2_max <- C2sub %>%
  group_by(Ingenuity.Canonical.Pathways) %>%
  summarise(max_zscore = max(z.score))

C2_merged <- left_join(C2sub, C2_max, by = "Ingenuity.Canonical.Pathways")

ggplot(C2_merged, aes(x = reorder(Ingenuity.Canonical.Pathways, max_zscore), y = z.score, fill = CellType, group = CellType)) + 
  geom_bar(stat = "identity", alpha = .6, position = "dodge", width = 0.8) +
  scale_fill_manual(values = Tfh_colors, name = "celltype") +
  coord_flip() + 
  xlab("") + 
  scale_y_continuous(position = "right", limits = c(0, 5)) + 
  theme_cowplot(12) + 
  theme(axis.text.y = element_text(size = 10))

Getting data for supplementary figure 1 H and I.

Input data, can be downloaded from the NCBI data base under accession number GSE272939 https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE272939

setwd(dirname(rstudioapi::getActiveDocumentContext()$path)) # This only works if you are using RStudio

file.names <- list.files(pattern = 'filtered_contig_annotations.csv', recursive = TRUE)
names <- list.files(path="VDJ_Mode")

file_seq <- 1:length(file.names)

if(any(grepl("*.csv", file.names))==TRUE){
  tfh_tcr <- file.names %>% 
    lapply(., function(i){
      fread(input = i)
    }) %>%
    setNames(names)
}

tfh_tcr <- combineTCR(tfh_tcr,
                      samples = names,
                      removeNA = TRUE,
                      removeMulti = TRUE,
                      filterMulti = TRUE)


## Now we need to edit the barcode, delete everything before "v3_"

tfh_tcr <- tfh_tcr %>% 
  purrr::map(~.x %>% mutate(barcode = gsub(".*v3_","",barcode)))


tfh_tcr <- tfh_tcr[c("Tfh1_vdjMode_BoyleLab_vdj_ref_v3","Tfh2_vdjMode_BoyleLab_vdj_ref_v3",
                     "Tfh17_vdjMode_BoyleLab_vdj_ref_v3","PD1Neg_vdjMode_BoyleLab_vdj_ref_v3")]

tfh_tcr <- tfh_tcr %>%
  purrr::map2(., file_seq, ~mutate(.x, barcode = sub(1, .y, barcode)))


tfh_tcr_db0 <- lapply(names(tfh_tcr), function(name) {
                df <- tfh_tcr[[name]]
                df$day <- name
                colnames(df)[1] <- "barcode"
                colnames(df)[2] <- "sample"
                colnames(df)[3] <- "TCR1"
                colnames(df)[4] <- "cdr3_aa1"
                colnames(df)[5] <- "cdr3_nt1"
                colnames(df)[6] <- "TCR2"
                colnames(df)[7] <- "cdr3_aa2"
                colnames(df)[8] <- "cdr3_nt2"
                colnames(df)[9] <- "CTgene"
                colnames(df)[10] <- "CTnt"
                colnames(df)[11] <- "CTaa"
                colnames(df)[12] <- "CTstrict"
                return(df)
})        
tfh_tcr_db = as.data.frame(do.call(rbind, tfh_tcr_db0))

#write.csv(tfh_tcr_db, "/working_groups/boylelab/shared/ZulyPava/Tfh_diversity_finalcode/input/tfh_tcr_db_30325.csv")
#Combining expression and VDJ output
tfh_rep <- combineExpression(tfh_tcr, Tfh.Map,
                             cloneCall="aa", 
                             proportion = FALSE, 
                             cloneSize=c(Single=1, Small=5, Medium=20, Large=100, Hyperexpanded=500))

tfh_rep@meta.data$integratedcluster <-  factor(tfh_rep@meta.data$integratedcluster, 
                                               levels = c("Tfh1", "Tfh1_CM", "Tfh2", "Tfh17", "Tfh1.17", "Tfreg", "IFNI", 
                                                          "heatshock", "ribosomal", "cluster2", "cluster3"))

cluster_col <- c("Tfh1" = "#9e0142", "Tfh1_CM" = "#f4a582",
                 "Tfh2" =  "#5e4fa2", "Tfh17" = "#66c2a5", 
                 "Tfh1.17" = "#e6f598", "Tfreg" = "#74add1", 
                 "IFNI" = "#8c510a", "heatshock" = "#bf812d",
                 "ribosomal"= "#fee090","cluster2" = "#01665e", "cluster3" = "#b2abd2")

cluster_orders <- c("Tfh1", "Tfh1_CM", "Tfh2", "Tfh17", "Tfh1.17", "Tfreg", "IFNI", "heatshock", "ribosomal", "cluster2", "cluster3")
cell.prop <- tfh_rep@meta.data %>% 
  mutate(TCR = case_when(is.na(CTaa) == TRUE ~ "No",
                         TRUE ~ "Yes"))

cell.prop <- cell.prop %>% 
  group_by(cloneSize) %>%
  summarise(n = n())

Supplementary Figure 1 H | Frequency of individual clones that occurred with clonal size of 1, 2, 3, 4, >5 in each subset.

clone_order <- rev(mixedsort(unique(filter(tfh_rep@meta.data, !is.na(cloneType))$cloneType)))

tfh_rep@meta.data <- tfh_rep@meta.data %>% 
  mutate(cloneSize = case_when(clonalFrequency == 1 ~ "1",
                               clonalFrequency == 2 ~ "2",
                               clonalFrequency == 3 ~ "3",
                               clonalFrequency == 4 ~ "4",
                               clonalFrequency > 5 ~ ">5")) %>% 
  mutate(cloneSize = factor(cloneSize, levels = c('1','2','3','4','>5')))

ggplot(filter(tfh_rep@meta.data, !is.na(cloneSize)), 
       aes(x = integratedcluster, fill = cloneSize))+
  geom_bar(colour = "black", position = position_stack(reverse = TRUE))+
  theme_classic()+
  scale_fill_manual(values = c("grey","#ABDDA4", "#66C2A5", "#3288BD", "#5E4FA2"))+
  labs(y = "Count")+
  theme(axis.title.x = element_blank(),
        axis.text.x = element_text(angle=90))

## Removing singletons
tfh_rep_s <- subset(tfh_rep, subset = clonalFrequency != 1)

Supp Figure 1 I | Circos plot of clonal overlap between subsets.

No clones were shared across donors. Expanded clonotypes (n³2) with only same fates (exist within the same subset), or different fates are shown.

# Create  a new column donor_clust for ChordDiagram "sector" 
tfh_rep_s@meta.data$donor_clust <- paste(tfh_rep_s@meta.data$donor_info, tfh_rep_s@meta.data$integratedcluster, sep="_")

# Create circos df using getCirclize function
tfh_circos <- getCirclize(tfh_rep_s, group.by = "donor_clust") %>% 
  filter(., value!=0)

# Metadata on naming the group
df_names <- tfh_rep_s@meta.data %>% 
  select(integratedcluster, donor_info, donor_clust) %>% 
  distinct(donor_clust, .keep_all = TRUE) %>% 
  remove_rownames() %>% 
  arrange(., factor(integratedcluster))

# We need to match color donor_clust variables based on cell type
# Currently done manually, need to find easier way
names <- unique(c(tfh_circos$from, tfh_circos$to))

grid.col <- case_when(grepl("_Tfh1$", names) ~ "#9e0142",
                      grepl("_Tfh1_CM$", names) ~ "#f4a582",
                      grepl("Tfh2", names) ~ "#5e4fa2",
                      grepl("Tfh17", names) ~ "#66c2a5",
                      grepl("Tfh1.17", names) ~ "#e6f598",
                      grepl("Tfreg", names) ~ "#74add1",
                      grepl("IFNI", names) ~ "#8c510a",
                      grepl("heatshock", names) ~ "#bf812d",
                      grepl("ribosomal", names) ~ "#fee090",
                      grepl("_cluster2$", names) ~ "#01665e",
                      grepl("_cluster3$", names) ~ "#b2abd2",
                      TRUE ~ NA)

names(grid.col) <- names

# We want to split the circos based on donor grouping
group = structure(df_names$donor_info, names = df_names$donor_clust)
group = factor(group, levels = c("donor0", "donor1", "donor2","donor3"))

# Now plot backbone of circos

chordDiagram(tfh_circos, 
             grid.col = grid.col,
             annotationTrack = "grid",
             preAllocateTracks = list(list(track.height = 0.075),
             #                          # list(track.height = 0.075),
                                      list(track.height = 0.001)),
             annotationTrackHeight = mm_h(5),
             group = group,
             big.gap = 20, small.gap = 1,
             direction.type = c("diffHeight", "arrows"))

# Now manually name the inner sector based on cell type
names <- data.frame(donor_clust = get.all.sector.index()) %>% 
  left_join(., df_names, by = "donor_clust")
names <- names$integratedcluster

for(i in seq_along(names)){
  si <- get.all.sector.index()[i]
  nm <- names[i]
  
    xlim = get.cell.meta.data("xlim", sector.index = si, track.index = 2)
    ylim = get.cell.meta.data("ylim", sector.index = si, track.index = 2)
    circos.text(mean(xlim), mean(ylim), nm, sector.index = si, track.index = 2, 
        facing = "outside", niceFacing = TRUE, col = "black", cex = 0.5)
}

highlight.sector(filter(df_names, donor_info == 'donor0')$donor_clust, track.index = 1, border = "black", col = "white", text = "Donor 0", cex = 0.75, text.col = "black", niceFacing = TRUE)
highlight.sector(filter(df_names, donor_info == 'donor1')$donor_clust, track.index = 1, border = "black", col = "white", text = "Donor 1", cex = 0.75, text.col = "black", niceFacing = TRUE)
highlight.sector(filter(df_names, donor_info == 'donor2')$donor_clust, track.index = 1, border = "black", col = "white", text = "Donor 2", cex = 0.75, text.col = "black", niceFacing = TRUE)
highlight.sector(filter(df_names, donor_info == 'donor3')$donor_clust, track.index = 1, border = "black", col = "white", text = "Donor 3", cex = 0.75, text.col = "black", niceFacing = TRUE)

circos.clear()

dev.set()
## png 
##   2

Supplementary Figure 4

Reading data for Supplementary Figure 4
Tfh.IBSM <- readRDS("/working_groups/boylelab/shared/ZulyPava/Tfh_diversity_finalcode/input/Tfh.IBSM.2_REANNOTATION_POSTNORM_V2_200724.rds")

DefaultAssay(Tfh.IBSM) <- "integrated"
# remove cluster 13, 30, 24, 29, 23, 17 bc they are low quality clusters

Tfh.IBSM.2 <- subset(Tfh.IBSM, subset = integrated_snn_res.2.2 %in% c(0,1,2,3,4,5,6,7,8,9,10,11,12,14,15,16,18,19,20,21,22,25,26,27,28))

Supplementary Figure 4 C /| QC of data from each donor across time points nCount_RNA, nFeature_RNA and percent mitochondrial content is shown

Tfh.IBSM@meta.data$day <- factor(Tfh.IBSM@meta.data$day, levels = c("day0", "day8",  "day16", "day36"))
Tfh.IBSM@meta.data$donor_id <- factor(Tfh.IBSM@meta.data$donor_id, levels = c("donor0", "donor1", "donor2", "donor3"))
viol_cols_QC <- c("donor0"="#FFFFFF", "donor1"="#8F9A98", "donor2"="#DADEDD", "donor3"="#4A5251")

##Creating each plot
A <- VlnPlot(Tfh.IBSM, features = c("nCount_RNA"),
            cols = viol_cols_QC, group.by = "day", split.by = "donor_id",
            pt.size = -1,  flip = FALSE) 
# Extract max y value from the plot
max_A <- ceiling(max(ggplot_build(A)$data[[1]]$y, na.rm = TRUE))

# Modify the plot to only show the max y tick
Af <-A + scale_y_continuous(breaks = max_A, labels = max_A) +
  theme(axis.ticks.y = element_blank(),
        axis.text.y = element_text(size = 10))
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
B <- VlnPlot(Tfh.IBSM, features = c("nFeature_RNA"),
             cols = viol_cols_QC, group.by = "day", split.by = "donor_id",
             pt.size = -1,  flip = FALSE) 
# Extract max y value from the plot
max_B <- ceiling(max(ggplot_build(B)$data[[1]]$y, na.rm = TRUE))

# Modify the plot to only show the max y tick
Bf <- B + scale_y_continuous(breaks = max_B, labels = max_B) +
  theme(axis.ticks.y = element_blank(),
        axis.text.y = element_text(size = 10))
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
C <- VlnPlot(Tfh.IBSM, features = c("percent.mt"),
             cols = viol_cols_QC, group.by = "day", split.by = "donor_id",
             pt.size = -1,  flip = FALSE) 
# Extract max y value from the plot
Y_vals <- ggplot_build(C)$data[[1]]$y
max_Y <-  ceiling(max(Y_vals, na.rm = TRUE))

# Modify the plot to only show the max y tick
Cf <- C + scale_y_continuous(breaks = max_Y, labels = max_Y) +
  theme(axis.ticks.y = element_blank(),
        axis.text.y = element_text(size = 10))
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
Final <- Af/Bf/Cf

print(Final)

Supplementary Figure 4 D /| Data before and after integration based on donor and CHMI day.

right4d <- DimPlot_scCustom(seurat_object = Tfh.IBSM, group.by = "donor_day", pt.size = 2, colors_use = DiscretePalette_scCustomize(num_colors = 16,palette = "stepped"))

print(right4d)

Supplementary Figure 4 E /| Over clustering of data prior to scType label transferer.

DimPlot_scCustom(seurat_object = Tfh.IBSM, group.by = "integrated_snn_res.2.2", pt.size = 2, label = T,colors_use = DiscretePalette_scCustomize(num_colors = 31,palette = "varibow"))

###### scType label..... 
# load gene set preparation function
source("https://raw.githubusercontent.com/IanevskiAleksandr/sc-type/master/R/gene_sets_prepare.R")
# load cell type annotation function
source("https://raw.githubusercontent.com/IanevskiAleksandr/sc-type/master/R/sctype_score_.R")

db_ = "/working_groups/boylelab/shared/ZulyPava/Tfh_diversity_finalcode/input/TfhMap_ClusterMarkers_DB_Reannotated_postnorm_190724.xlsx";
tissue = "Immune system"

# prepare gene sets
gs_list = gene_sets_prepare(db_, tissue)

# get cell-type by cell matrix
es.max = sctype_score(scRNAseqData = Tfh.IBSM.2[["integrated"]]@scale.data, scaled = TRUE, 
                      gs = gs_list$gs_positive, gs2 = gs_list$gs_negative)  

#write.csv(es.max, "TfhIBSM_scType_esmax_scoringpercell_310524.csv")
# merge by cluster
cL_resutls = do.call("rbind", lapply(unique(Tfh.IBSM.2@meta.data$integrated_snn_res.2.2), function(cl){
  es.max.cl = sort(rowSums(es.max[ ,rownames(Tfh.IBSM.2@meta.data[Tfh.IBSM.2@meta.data$integrated_snn_res.2.2==cl, ])]), decreasing = !0)
  head(data.frame(cluster = cl, type = names(es.max.cl), scores = es.max.cl, ncells = sum(Tfh.IBSM.2@meta.data$integrated_snn_res.2.2==cl)), 10)
}))

sctype_scores = cL_resutls %>% group_by(cluster) %>% top_n(n = 1, wt = scores)  

# set low-confident (low ScType score) clusters to "unknown"
sctype_scores$type[as.numeric(as.character(sctype_scores$scores)) < sctype_scores$ncells/4] = "Unknown"

Tfh.IBSM.2@meta.data$customclassif = ""
for(j in unique(sctype_scores$cluster)){
  cl_type = sctype_scores[sctype_scores$cluster==j,]; 
  Tfh.IBSM.2@meta.data$customclassif[Tfh.IBSM.2@meta.data$integrated_snn_res.2.2 == j] = as.character(cl_type$type[1])
}
# write.csv(cL_resutls, "TfhIBSM_scType_cL_results_postnorm_200724.csv")
# write.csv(sctype_scores, "TfhIBSM_scType_sctype_scores_postnorm_200724.csv")

Supplementary Figure 4 F /| scType was used to label transfer cell signatures (top 20 up and down regulated genes)for each cell subset identified in healthy ‘map’ data onto over-clustered CHMI data. Each cell cluster was scored for each reference signature (left), and the majority score (right) used to collapse clusters into cell subsets.

####### plotting sctype scores.... 

#cL_df <- cL_resutls
#cL_df <- read.csv("TfhIBSM_scType_cL_results_postnorm_200724.csv")
cL_df <- read.csv("/working_groups/boylelab/shared/ZulyPava/Tfh_diversity_GEX_4.4/TfhIBSM_scType_clresults_scoringpercluster_310524.csv")
#cL_df <- cL_df[, -c(1,5)]

cL_df$type <- factor(cL_df$type, levels = c("Tfh1", "Tfh1_CM", "Tfh2", "Tfh17",
                                            "IFNI", "heatshock", "ribo", "cluster2", "cluster3"))


Tfh_colors_list <- c("#9e0142", "#f4a582", "#5e4fa2", "#66c2a5", 
                     "#8c510a", "#bf812d", "#fee090", "#01665e", "#b2abd2")


####Supplementary Figure 4F 
number_ticks <- function(n) {function(limits) pretty(limits, n)}

# ##"TfhIBSM_scType_output_cL_results_barplot_postnorm_200724.pdf", h = 6, w = 8) |incomplete cannot add the low confidence information with current r version. probably need to move after confidence calculation. the read file approaches real plot than calculated one.  
ggplot(cL_df, aes(fill=type, y=scores, x=cluster)) +
  scale_fill_manual(values =Tfh_colors_list) +
  geom_bar(position="stack", stat="identity") +
  scale_x_continuous(breaks = 0:29)+
  theme_classic()

#### Supplementary Figure 4 H /| Final predicted label for each cell cluster

sctype_scores$type <- factor(sctype_scores$type, levels = c("Tfh1", "Tfh1_CM", "Tfh2", "Tfh17",
                                                            "IFNI", "heatshock", "ribo", "cluster2", "cluster3"))

##Supplementary Figure 4HFinal predicted label for each cell cluster. 
##|wrong CLUSTER 14 (Tfh2 instead of Tfh1_CM), 19 (Tfh17 instead of cluster2) AND 28 not present in final plot

##pdf("TfhIBSM_scType_output_scType_scores_barplot_postnorm_200724.pdf", h = 6, w = 8)
ggplot(sctype_scores, aes(x=cluster, y=scores, fill=type)) +
  scale_fill_manual(values =Tfh_colors_list) +
  geom_bar(stat="identity")+theme_classic()

# ###### adding some confidence... 
# # Calculate the highest score for each cluster
# max_scores <- cL_df %>%
#   group_by(cluster) %>%
#   summarize(max_score = max(scores))
# 
# # Join the max scores back to the original data
# data_with_max <- cL_df %>%
#   left_join(max_scores, by = "cluster")
# 
# 
# # Calculate the fold change of each score over the maximum score for each cluster
# cL_df2 <- data_with_max %>%
#   mutate(confidence = max_score / scores)
# 
# # Flag each row based on confidence
# cL_df2 <- cL_df2 %>%
#   mutate(flag_stripes = case_when(
#     confidence < 0 ~ "normal",   # Flag negative values as "normal"
#     confidence == 1.0 ~ "choice", # Flag exactly 1.0000 as "choice"
#     confidence < 1.1 ~ "low",     # Flag values less than 1.1 (excluding 1.0000) as "low"
#     TRUE ~ "normal"               # Flag all other values as "normal"
#   ))
# 
# # Identify clusters with "low" flags
# clusters_with_low <- cL_df2 %>%
#   filter(flag_stripes == "low") %>%
#   group_by(cluster) %>%
#   summarize(num_low_flags = n()) %>%
#   ungroup()
# 
# # Add a column to indicate low confidence (you can customize the condition)
# cL_df2$low_confidence <- cL_df2$flag_stripes == "low"
# 
# #### Supplementary Figure 4F) scType was used to label transfer cell signatures (top 20 up and down regulated genes)70
# # for each cell subset identified in healthy ‘map’ data onto over-clustered CHMI data. Each cell cluster71
# # was scored for each reference signature (left), and the majority score (right) used to collapse clusters72
# # into cell subsets.  
# ##"TfhIBSM_scType_output_cL_results_barplot_postnorm_WITHCONFIDENCE_200724.pdf", h = 6, w = 8)
# # ggplot(cL_df2, aes(y = scores, x = cluster, fill = type, pattern = low_confidence)) +
# #   geom_bar_pattern(position = "stack", stat = "identity",
# #                    pattern_fill = "black",
# #                    pattern_angle = 45,
# #                    pattern_density = 0.1,
# #                    pattern_spacing = 0.02) +
# #   scale_fill_manual(values = Tfh_colors_list) +
# #   scale_pattern_manual(values = c(`TRUE` = "stripe", `FALSE` = "none")) +
# #   theme_classic()
# 
# # ggplot(cL_df2, aes(y = scores, x = cluster, fill = type, pattern = low_confidence)) +
# #   geom_bar_pattern(position = "stack", stat = "identity",
# #                    pattern_fill = "black",
# #                    pattern_angle = 45,
# #                    pattern_density = 0.1,
# #                    pattern_spacing = 0.02) +
# #   scale_fill_manual(values = Tfh_colors_list) +
# #   scale_pattern_manual(values = c(`TRUE` = "stripe", `FALSE` = "none")) +
# #   theme_classic()
# 
# #### that cluster 14 - scores between Tfh1/TFh1CM/Tfh2 are VERY VERY CLOSE...
# #  Tfh1CM: 4274, Tfh2: 4474

####Supplementary Figure 4 G /| Tfh cell cluster markers in each subset for manual adjustment of “low-confidence” predicted cluster identities.

DefaultAssay(Tfh.IBSM) <- "RNA"
Tfh.mixed2 <- subset(Tfh.IBSM, subset = integrated_snn_res.2.2 %in% c(14,18,19,21,20,22))

Tfh.mixed2@meta.data$integrated_snn_res.2.2 <- factor(Tfh.mixed2@meta.data$integrated_snn_res.2.2,
                                                         levels = c(14,18,20,22,19,21))

#pdf("TfhIBSM_mixedTfhs_clustercomparison_vlnplot_070824.pdf", h =8, w = 4)
viol_colors = c("#2EA174","#4594D0","#2C4B9F","#3642A3", "#1E4DA8", "#425BB0")
Stacked_VlnPlot(Tfh.mixed2, 
                features = c("CXCR3", "GZMK", "CST7", "NKG7", "CCR7", "SELL", "MAF", "TOX2", "TCF7", "CCR6", "BHLHE40"), 
                x_lab_rotate = TRUE,colors_use = viol_colors,
                group.by = "integrated_snn_res.2.2")

metadata <- as.data.frame(Tfh.IBSM.2@meta.data)

try <- metadata[, c("seurat_clusters", "day", "donor_id", "alternative_clusters")]

try.5 <- try %>%
  group_by(alternative_clusters, donor_id, day) %>%  
  summarise(n = n(), .groups = "drop") %>%  
  group_by(donor_id, day) %>%
  mutate(freq = n / sum(n))

## Add order before plotting
try.5$donor_id <- factor(try.5$donor_id, levels = c("donor0", "donor1", "donor2", "donor3"))

try.5$day <- factor(try.5$day, levels = c("day0", "day8", "day16", "day36"))

try.5$alternative_clusters <- factor(try.5$alternative_clusters, 
                                     levels = c("Tfh1", "Tfh1_CM", "Tfh2", "Tfh17","Tfh1.17",
                                                "Tfreg", "IFNI", "heatshock", "ribo",
                                                "cluster2", "cluster3"))
##Only clusters showing 
try.5_subset <- try.5 %>%
  filter(alternative_clusters %in% c("Tfreg", "IFNI", "heatshock", "ribo",
                                                "cluster2", "cluster3"))
Supplementary Figure 4 I /| Relative proportion of each pTfh transcriptional cluster during CHMI
ggplot(data = try.5_subset, aes(x=day, y=freq)) +
  geom_line(aes(group = donor_id, colour = donor_id)) +
  geom_point(aes(group = donor_id, colour = donor_id)) + 
  geom_boxplot(data = try.5_subset, aes(x = day, y = freq), fill = "grey", width = 0.5, outlier.shape = NA) + 
  facet_wrap(~alternative_clusters, ncol = 3) + 
  theme_classic() + 
  scale_colour_manual(values = c("#d53e4f", "#3288bd", "#1a9850", "#c2a5cf"))

Supplementary Figure 5: TCR analysis of Tfh cells in CHMI

setwd(dirname(rstudioapi::getActiveDocumentContext()$path)) # This only works if you are using RStudio

file.names <- list.files(pattern = 'filtered_contig_annotations.csv', recursive = TRUE)
names <- list.files(path='VDJ')

file_seq <- 1:length(file.names)

if(any(grepl("*.csv", file.names))==TRUE){
  ibsm_tcr <- file.names %>% 
    lapply(., function(i){
      fread(input = i)
    }) %>%
    setNames(names)
}

ibsm_tcr <- ibsm_tcr[mixedsort(names(ibsm_tcr))]

ibsm_tcr <- ibsm_tcr %>%
  map2(., file_seq, ~mutate(.x, barcode = sub(1, .y, barcode)))

names <- names(ibsm_tcr)

ibsm_tcr <- combineTCR(ibsm_tcr,
                       samples = names,
                       removeNA = TRUE,
                       removeMulti = TRUE,
                       filterMulti = TRUE)

## Now we need to edit the barcode, delete everything before "v3_"

ibsm_tcr <- ibsm_tcr %>% 
  map(~.x %>% mutate(barcode = gsub(".*VDJ_","",barcode)))

ibsm_tcr_db0 <- lapply(names(ibsm_tcr), function(name) {
                df <- ibsm_tcr[[name]]
                df$day <- name
                colnames(df)[1] <- "barcode"
                colnames(df)[2] <- "sample"
                colnames(df)[3] <- "TCR1"
                colnames(df)[4] <- "cdr3_aa1"
                colnames(df)[5] <- "cdr3_nt1"
                colnames(df)[6] <- "TCR2"
                colnames(df)[7] <- "cdr3_aa2"
                colnames(df)[8] <- "cdr3_nt2"
                colnames(df)[9] <- "CTgene"
                colnames(df)[10] <- "CTnt"
                colnames(df)[11] <- "CTaa"
                colnames(df)[12] <- "CTstrict"
                return(df)
})        
ibsm_tcr_db = as.data.frame(do.call(rbind, ibsm_tcr_db0))

write.csv(ibsm_tcr.db, "/working_groups/boylelab/shared/ZulyPava/Tfh_diversity_finalcode/input/ibsm40_tcr_db_30325.csv")
Import Seurat
##reading seurat object 
ibsm <- readRDS("/working_groups/boylelab/shared/ZulyPava/Tfh_diversity_TCR_4.4/IBSM40/Tfh.IBSM.2_REANNOTATION_POSTNORM_V2_200724.rds")
Combine VDJ and Seurat
ibsm_rep <- combineExpression(ibsm_tcr, ibsm,
                              cloneCall="aa", 
                              proportion = FALSE,
                              cloneSize=c(Single=1, Small=5, Medium=20, Large=100, Hyperexpanded=500))

ibsm_rep@meta.data$seurat_clusters <-  factor(ibsm_rep@meta.data$seurat_clusters, 
                                              levels = c("Tfh1", "Tfh1_CM", "Tfh2", "Tfh17", "Tfreg", "IFNI", 
                                                         "heatshock", "ribo", "cluster2", "cluster3"))

ibsm_rep@meta.data$alternative_clusters <-  factor(ibsm_rep@meta.data$alternative_clusters, 
                                              levels = c("Tfh1", "Tfh1_CM", "Tfh2", "Tfh17", "Tfreg", "IFNI", 
                                                         "heatshock", "ribo", "cluster2", "cluster3"))

##Determine clone order
ibsm_rep@meta.data <- ibsm_rep@meta.data %>% 
  mutate(cloneN = case_when(clonalFrequency == 1 ~ "1",
                               clonalFrequency == 2 ~ "2",
                               clonalFrequency == 3 ~ "3",
                               clonalFrequency == 4 ~ "4",
                               clonalFrequency > 5 ~ ">5")) %>% 
  mutate(cloneN = factor(cloneN, levels = c('1','2','3','4','>5')))


ibsm_rep@meta.data <- ibsm_rep@meta.data %>% 
  mutate(cloneN2 = case_when(clonalFrequency == 1 ~ "Clone # (N=1)",
                               clonalFrequency > 1 ~ "Clone # (N>1)")) %>% 
  mutate(cloneN2 = factor(cloneN2, levels = c('Clone # (N=1)','Clone # (N>1)')),
         day = factor(day, levels = c("day0", "day8", "day16", "day36")))
How many cells:TCR
cluster_col <- c("Tfh1" = "#9e0142", "Tfh1_CM" = "#f4a582", "Tfh2" = "#5e4fa2", 
                 "Tfh17" = "#66c2a5", "Tfreg" = "#74add1", 
                 "IFNI" = "#8c510a", "heatshock" = "#bf812d", "ribosomal" = "#fee090", 
                 "cluster2" = "#01665e", "cluster3" = "#b2abd2")

cluster_orders <- c("Tfh1", "Tfh1_CM", "Tfh2", "Tfh17", "Tfreg", "IFNI", "heatshock", "ribo", "cluster2", "cluster3")

donor_col <- c("donor0"="#d53e4f", "donor1"="#3288bd", "donor2"="#1a9850", "donor3" = "#c2a5cf")
cell.prop <- ibsm_rep@meta.data %>% 
  mutate(TCR = case_when(is.na(CTaa) == TRUE ~ "No",
                         TRUE ~ "Yes"))

cell.prop <- cell.prop %>% 
  group_by(alternative_clusters, TCR) %>% 
  summarise(n = n()) %>% 
  group_by(alternative_clusters) %>% 
  mutate(total = sum(n)) %>% 
  mutate(prop = n/total*100)

Supplementary Figure 5 A | Proportion of cells with TCR captured across each subset.

ggplot(cell.prop, aes(x = alternative_clusters, y = prop, fill = TCR))+
  geom_col(color = "black")+
  theme_bw()+
  ylab("Proportion")+
  geom_text(aes(label = round(prop, digits = 1)), position = position_stack(vjust = 0.5))+
  theme(axis.title.x = element_blank(),
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))+
  scale_fill_manual(values = c("grey","white"))

#### Supplementary Figure 5 B | Clonal size of other Tfh cell subsets identified over time.

ibsm_rep_subC <-  subset(ibsm_rep, subset = alternative_clusters %in% c("Tfreg", "IFNI", "heatshock", "ribo", "cluster2", "cluster3"))

ggplot(filter(ibsm_rep_subC@meta.data, !is.na(cloneN)), 
       aes(x = alternative_clusters, fill = cloneN))+
  geom_bar(colour = "black", position = position_stack(reverse = TRUE), linewidth = 0.3)+
  # geom_text(stat='count', aes(label= ..count..), vjust=-1)+
  # scale_x_discrete(limits = clone_order)+
  theme_classic()+
  # scale_fill_brewer(palette = "GnBu", direction = 1)+
  scale_fill_manual(values = c("grey","#ABDDA4", "#66C2A5", "#3288BD", "#5E4FA2"))+
  labs(y = "Count")+
  theme(axis.title.x = element_blank(),
        axis.text.x = element_text(angle=90))+
  facet_wrap(~day, nrow = 1)

Supplementary Figure 5 C | Clonal overlap between donor and day.

set.seed(123)
ibsm_rep@meta.data$day = factor(ibsm_rep@meta.data$day, levels = c('day0','day8','day16','day36'))
ibsm_rep@meta.data$donor_day = factor(ibsm_rep@meta.data$donor_day,
                                      levels = c('donor0_day0','donor0_day8','donor0_day16','donor0_day36',
                                                   'donor1_day0','donor1_day8','donor1_day16','donor1_day36',
                                                   'donor2_day0','donor2_day16','donor2_day36',
                                                   'donor3_day0','donor3_day8','donor3_day16','donor3_day36'))
ibsm_repMon <- subset(ibsm_rep, subset = clonalFrequency != 1)
ibsm_repMon <- subset(ibsm_repMon, subset = donor_day != 'donor2_day8')

ibsm_overlap <- clonalOverlap(ibsm_repMon, cloneCall = "aa", 
                              method = "jaccard", group.by = "donor_day", exportTable = F, palette = "plasma")+
    theme(axis.title = element_blank(),
        #axis.text = element_blank(),
        axis.text.x = element_text(angle = 90),
        axis.ticks = element_blank()
        )

meta <- ibsm_rep@meta.data %>% 
  distinct(donor_day, donor_id, day)

meta$day = factor(meta$day, levels = c('day0','day8','day16','day36'))
meta$donor_day = factor(meta$donor_day, levels = c('donor0_day0','donor0_day8','donor0_day16','donor0_day36',
                                                   'donor1_day0','donor1_day8','donor1_day16','donor1_day36',
                                                   'donor2_day0','donor2_day16','donor2_day36',
                                                   'donor3_day0','donor3_day8','donor3_day16','donor3_day36'))

donor_labs <- meta %>%
  filter(!is.na(donor_id)) %>%
  filter(!is.na(donor_day)) %>%
  ggplot(aes(donor_day, y = 1, fill = donor_id)) +
  geom_tile() +
  scale_fill_brewer(palette = 'Set1', name = "Donor ID") +
  theme_void()

day_labs <- meta %>%
  filter(!is.na(day)) %>%
  filter(!is.na(donor_day)) %>%
  ggplot(aes(donor_day, y = 1, fill = day)) +
  geom_tile() +
  scale_fill_brewer(palette = 'Set3', name = "Day") +
  theme_void()

final_plot <- day_labs / donor_labs / ibsm_overlap +
  patchwork::plot_layout(heights = c(0.05, 0.05, 1), guides = 'collect')


print(final_plot)

Supplementary Figure 5 D | Circos plot indicating clonal overlap between cell clusters and day for each donor.

ibsm_rep <- subset(ibsm_rep, subset = donor_day != 'donor2_day8')
# Create  a new column day_clust for ChordDiagram "sector"
ibsm_rep@meta.data$day_clust <- paste(ibsm_rep@meta.data$day,
                                      ibsm_rep@meta.data$alternative_clusters, sep="_")

donors <- mixedsort(unique(ibsm_rep@meta.data$donor_id))

for (i in donors){
  
  # Create circos df using getCirclize function
  ibsm_circos <- getCirclize(subset(ibsm_rep, subset = donor_id == i), 
                             group.by = "day_clust") %>% filter(., value!=0)
    
    # Metadata on naming the group
  df_names <- filter(ibsm_rep@meta.data, donor_id == i) %>% 
    select(alternative_clusters, day, day_clust) %>% 
    distinct(day_clust, .keep_all = TRUE) %>% 
    remove_rownames() %>% 
    arrange(., factor(alternative_clusters))

  
  # We need to match color day_clust variables based on cell type
  # Currently done manually, need to find easier way
  names <- unique(c(ibsm_circos$from, ibsm_circos$to))
  
  grid.col <- case_when(grepl("_Tfh1$", names) ~ "#9e0142",
                        grepl("_Tfh1_CM$", names) ~ "#f4a582",
                        grepl("Tfh2", names) ~ "#5e4fa2",
                        grepl("Tfh17", names) ~ "#66c2a5",
                        grepl("Tfreg", names) ~ "#74add1",
                        grepl("IFNI", names) ~ "#8c510a",
                        grepl("heatshock", names) ~ "#bf812d",
                        grepl("ribo", names) ~ "#fee090",
                        grepl("_cluster2$", names) ~ "#01665e",
                        grepl("_cluster3$", names) ~ "#b2abd2",
                        TRUE ~ NA)
  
  names(grid.col) <- names
  
  # We want to split the circos based on Day grouping
  group = structure(df_names$day, names = df_names$day_clust)
  group = factor(group, levels = mixedsort(unique(group)))
  
  # Now plot backbone of circos
  chordDiagram(ibsm_circos, 
               grid.col = grid.col,
               annotationTrack = "grid",
               preAllocateTracks = list(list(track.height = 0.075),
               #                          # list(track.height = 0.075),
                                        list(track.height = 0.001)),
               annotationTrackHeight = mm_h(5),
               group = group,
               big.gap = 20, small.gap = 1,
               direction.type = c("diffHeight", "arrows"))
  
  title(i)
  
  # Now manually name the inner sector based on cell type
  names <- data.frame(day_clust = get.all.sector.index()) %>% 
    left_join(., df_names, by = "day_clust")
  names <- names$alternative_clusters
  
  for(i in seq_along(names)){
    si <- get.all.sector.index()[i]
    nm <- names[i]
    
      xlim = get.cell.meta.data("xlim", sector.index = si, track.index = 2)
      ylim = get.cell.meta.data("ylim", sector.index = si, track.index = 2)
      circos.text(mean(xlim), mean(ylim), nm, sector.index = si, track.index = 2, 
          facing = "outside", niceFacing = TRUE, col = "black", cex = 0.5)
  }
  
  # Now plot outer ribbon on days
  # highlight.sector(filter(df_names, day == 'day0')$day_clust, track.index = 1, col = viridis(n=4)[1], text = "Day 0", cex = 0.75, text.col = "white", niceFacing = TRUE)
  # highlight.sector(filter(df_names, day == 'day8')$day_clust, track.index = 1, col = viridis(n=4)[2], text = "Day 8", cex = 0.75, text.col = "white", niceFacing = TRUE)
  # highlight.sector(filter(df_names, day == 'day16')$day_clust, track.index = 1, col = viridis(n=4)[3], text = "Day 16", cex = 0.75, text.col = "white", niceFacing = TRUE)
  # highlight.sector(filter(df_names, day == 'day36')$day_clust, track.index = 1, col = viridis(n=4)[4], text = "Day 36", cex = 0.75, text.col = "white", niceFacing = TRUE)

  for (j in unique(group)){
    highlight.sector(filter(df_names, day == j)$day_clust, track.index = 1, , border = "black", col = "white", text = j, cex = 0.75, text.col = "black", niceFacing = TRUE)
  }
  
  circos.clear()
}

dev.set()
## png 
##   2

Supplementary Figure 6: Tfh cell profiles at day 16 following CHMI

Reading data for Supplementary Figure 6
TfhIBSM <- readRDS("/working_groups/boylelab/shared/ZulyPava/Tfh_diversity_finalcode/input/Tfh.IBSM.2_REANNOTATION_POSTNORM_V2_200724.rds")

Tfh_colors_list <- c("#9e0142", "#f4a582", "#5e4fa2", "#66c2a5", "#74add1",
                     "#8c510a", "#bf812d", "#fee090", "#01665e", "#b2abd2")

TfhIBSM@meta.data$alternative_clusters <- factor(TfhIBSM@meta.data$alternative_clusters, 
                                            levels = c("Tfh1", "Tfh1_CM", "Tfh2", "Tfh17","Tfh1.17",
                                                       "Tfreg", "IFNI", "heatshock", "ribo",
                                                       "cluster2", "cluster3"))

Supplementary Figure 6 A /| Expression of activation genes for all clusters at Day 16

# Pull out just D16 cells and see..

Tfh.16 <- subset(TfhIBSM, subset = day == "day16")

#DefaultAssay(Tfh.16)

# to check if lognorm has been performed... or alternatively just run each counts/data separately, logorm will be non-integers ie 1.xxx
#Tfh.16[['RNA']]@counts@x == Tfh.16[['RNA']]@data@x

# Activation genes

Stacked_VlnPlot(Tfh.16, features = c("MKI67", "CD38", "ICOS"), x_lab_rotate = TRUE,
                group.by = "alternative_clusters", colors_use = Tfh_colors_list)

#### Supplementary Figure 6 B and C /| Despite upregulation of genes associated with inflammation, expression profiles were still different between Tfh1_cyto and Tfh1_CCR7 subsets. Tfh cell subsets maintained expression profiles of identifying cluster markers.

# Tfh1 genes..

Stacked_VlnPlot(Tfh.16, features = c("GZMK", "NKG7", "CCL5", "CXCR3"),
                x_lab_rotate = TRUE,
                group.by = "alternative_clusters", colors_use = Tfh_colors_list)

# TCM/Th2 genes...

Stacked_VlnPlot(Tfh.16, features = c("SELL", "CCR7", "TIGIT", "SH2D1A", "TCF7", "MAF"),
                x_lab_rotate = TRUE,
                group.by = "alternative_clusters", colors_use = Tfh_colors_list)

###### Getting data for Supplementary Figure 6D

TfhIBSM <- readRDS("/working_groups/boylelab/shared/ZulyPava/Tfh_diversity_finalcode/input/Tfh.IBSM.2_REANNOTATION_POSTNORM_V2_200724.rds")

TfhIBSM@meta.data$alternative_clusters <- factor(TfhIBSM@meta.data$alternative_clusters,
                                                 levels = c("Tfh1", "Tfh1_CM", "Tfh2", "Tfh17",
                                                            "Tfh1.17", "Tfreg", "IFNI",
                                                            "heatshock", "ribo", "cluster2",
                                                            "cluster3"))

## first we want to get a pseudobulk count matrix..

# renaming cluster for ease.. (you can skip this if you don't need)
TfhIBSM@meta.data$alternative_clusters <- gsub("Tfh1_CM", "Tfh1CM", TfhIBSM@meta.data$alternative_clusters, fixed = TRUE)

TfhIBSM@meta.data$donor_day_cluster <- paste(TfhIBSM@meta.data$donor_day, TfhIBSM@meta.data$alternative_clusters, sep = "_")

TfhIBSM@meta.data$donor_day_cluster <- factor(TfhIBSM@meta.data$donor_day_cluster)

# now getting pseudobulk counts by the combination I want: donor_day_cluster
DefaultAssay(TfhIBSM) <- "RNA"

TfhIBSM<- NormalizeData(TfhIBSM)

# this is right... you aggregate using raw counts... 
TfhIBSM.Average <- AggregateExpression(TfhIBSM, group.by = "donor_day_cluster", slot = "counts", assays = "RNA", return.seurat = T)
## Names of identity class contain underscores ('_'), replacing with dashes ('-')
## Centering and scaling data matrix
## 
## This message is displayed once every 8 hours.
### tidying up metadata naming
TfhIBSM.Average@meta.data$donor_day_cluster <- as.factor(rownames(TfhIBSM.Average@meta.data))

# Split name column into firstname and last name
TfhIBSM.Average@meta.data[c("donor", "day", "seurat_clusters")] <- str_split_fixed(TfhIBSM.Average@meta.data$donor_day_cluster, "-", 3)

# Now extract the pseudobulked log-counts (NON-SCALED)    

TfhIBSM.scalemat <- GetAssayData(TfhIBSM.Average, layer = "data") # data is already log-normalised.. 

## put in metadata
TfhIBSM.meta <- as.data.frame(TfhIBSM.Average@meta.data)
TfhIBSM.scalemat <- as.data.frame(TfhIBSM.scalemat)
TfhIBSM.scalemat <- as.data.frame(t(TfhIBSM.scalemat))

TfhIBSM.scalemat$donor_day_cluster<- rownames(TfhIBSM.scalemat)


Average.df <- merge(x = TfhIBSM.scalemat, y = TfhIBSM.meta, by = "donor_day_cluster")

# Average.df <- read.csv("TfhIBSM_Averagemat_LOGNORMALISED_donordaycluster_V2_090824.csv", row.names = 1)
rownames(Average.df) <- Average.df$donor_day_cluster

Average.df$day <- factor(Average.df$day, levels = c("day0", "day8", "day16", "day36"))
Average.df$seurat_clusters <- factor(Average.df$seurat_clusters, levels = c("Tfh1", "Tfh1CM", "Tfh2", "Tfh17",
                                                                            "Tfreg", "IFNI",
                                                                            "heatshock", "ribo", "cluster2",
                                                                            "cluster3"))

# ### subset
# Average.dfsub <- Average.df[Average.df$day == "day16", ]
# Average.dfsub <- Average.dfsub[Average.dfsub$seurat_clusters %in% c("Tfh1", "Tfh1CM", "Tfh2"), ]
# 
# 
# #### okay now to reorder EVERYTHING... 
# 
# my_order <- c("donor0-day16-Tfh1", "donor1-day16-Tfh1", "donor2-day16-Tfh1", "donor3-day16-Tfh1",
#               "donor0-day16-Tfh1CM", "donor1-day16-Tfh1CM", "donor2-day16-Tfh1CM", "donor3-day16-Tfh1CM",
#               "donor0-day16-Tfh2", "donor1-day16-Tfh2", "donor2-day16-Tfh2", "donor3-day16-Tfh2")
# 
# Average.dfsub <- Average.dfsub[match(my_order, rownames(Average.dfsub)), ]
# 
# ### now reoreder everything else I want as well...
# Average.dfsub$seurat_clusters <- factor(Average.dfsub$seurat_clusters, levels = c("Tfh1","Tfh1CM", "Tfh2"))
# Average.dfsub$donor <- factor(Average.dfsub$donor, levels = c("donor0", "donor1", "donor2", "donor3"))
upset0 <-  read.csv(file="/working_groups/boylelab/shared/ZulyPava/Tfh_diversity_finalcode/sigGenes/psbulk_sigGens_all_DayClusters_080824.csv", row.names = 1)

##Getting dataset
##Input  
## Long format: gene, pval, logFC, cluster, day
## From pseudobulk analysis subset of genes with p_adj <0.005 and
## LogFC>abs(1)

upset1 <- upset0[,c(3,8,4,1,2)]
#Change colnames
colnames(upset1)[4] ="celltype"
colnames(upset1)[3] ="Log2FC"
colnames(upset1)[2] ="p_adj"

#reorder...
upset1$day <- factor(upset1$day, levels = c("day8", "day16", "day36"))
upset1$celltype <- factor(upset1$celltype, levels = c("Tfh1", "Tfh1CM", "Tfh2", "Tfh17", "Tfreg", "IFNI", "heatshock", "ribo", "cluster2", "cluster3"))
### subset
Average.dfsub <- Average.df[Average.df$day %in% c("day0", "day16"), ]
table(Average.dfsub$day)
## 
##  day0  day8 day16 day36 
##    40     0    40     0
table(Average.dfsub$seurat_clusters)
## 
##      Tfh1    Tfh1CM      Tfh2     Tfh17     Tfreg      IFNI heatshock      ribo 
##         8         8         8         8         8         8         8         8 
##  cluster2  cluster3 
##         8         8
Average.dfsub <- Average.dfsub[Average.dfsub$seurat_clusters %in% c("Tfh1", "Tfh1CM", "Tfh2"), ]
table(Average.dfsub$seurat_clusters)
## 
##      Tfh1    Tfh1CM      Tfh2     Tfh17     Tfreg      IFNI heatshock      ribo 
##         8         8         8         0         0         0         0         0 
##  cluster2  cluster3 
##         0         0
#### okay now to reorder EVERYTHING... 

my_order <- c("donor0-day0-Tfh1", "donor1-day0-Tfh1", "donor2-day0-Tfh1", "donor3-day0-Tfh1",
              "donor0-day16-Tfh1", "donor1-day16-Tfh1", "donor2-day16-Tfh1", "donor3-day16-Tfh1",
              "donor0-day0-Tfh1CM", "donor1-day0-Tfh1CM", "donor2-day0-Tfh1CM", "donor3-day0-Tfh1CM",
              "donor0-day16-Tfh1CM", "donor1-day16-Tfh1CM", "donor2-day16-Tfh1CM", "donor3-day16-Tfh1CM",     
              "donor0-day0-Tfh2", "donor1-day0-Tfh2", "donor2-day0-Tfh2", "donor3-day0-Tfh2",
              "donor0-day16-Tfh2", "donor1-day16-Tfh2", "donor2-day16-Tfh2", "donor3-day16-Tfh2")

Average.dfsub <- Average.dfsub[match(my_order, rownames(Average.dfsub)), ]



### now reoreder everything else I want as well...
Average.dfsub$seurat_clusters <- factor(Average.dfsub$seurat_clusters, levels = c("Tfh1","Tfh1CM", "Tfh2"))
Average.dfsub$donor <- factor(Average.dfsub$donor, levels = c("donor0", "donor1", "donor2", "donor3"))
Average.dfsub$day <- factor(Average.dfsub$day, levels = c("day0", "day16"))



### subset gene lists... 
colnames(upset1)[3] <- "logFC"
Tfh1CMgenes <- upset1%>%filter(celltype=="Tfh1CM", day== "day16")
Tfh1genes <- upset1%>%filter(celltype=="Tfh1", day== "day16")
Tfh2genes <- upset1%>%filter(celltype=="Tfh2", day== "day16")


# edgeR was done - D16 v D0, so logFC< 0 = up at d16, log FC >0 = down at d16, but up at day 0 
Tfh1CMgenes$dir_sig[Tfh1CMgenes$p_adj<0.05 & Tfh1CMgenes$logFC<0] <-"up"
Tfh1CMgenes$dir_sig[Tfh1CMgenes$p_adj<0.05 & Tfh1CMgenes$logFC>0] <-"down"
Tfh1genes$dir_sig[Tfh1genes$p_adj<0.05 & Tfh1genes$logFC<0] <-"up"
Tfh1genes$dir_sig[Tfh1genes$p_adj<0.05 & Tfh1genes$logFC>0] <-"down"
Tfh2genes$dir_sig[Tfh2genes$p_adj<0.05 & Tfh2genes$logFC<0] <-"up"
Tfh2genes$dir_sig[Tfh2genes$p_adj<0.05 & Tfh2genes$logFC>0] <-"down"

# let's get D16up - top 50
Tfh1CMgenes$logFC <- Tfh1CMgenes$logFC*-1
Tfh1CMgenes <- Tfh1CMgenes[order(Tfh1CMgenes$logFC, decreasing = TRUE),]
topCM <- head(Tfh1CMgenes, n=20)
Tfh1genes$logFC <- Tfh1genes$logFC*-1
Tfh1genes <- Tfh1genes[order(Tfh1genes$logFC, decreasing = TRUE),]
top1 <- head(Tfh1genes, n=20)
Tfh2genes$logFC <- Tfh2genes$logFC *-1
Tfh2genes <- Tfh2genes[order(Tfh2genes$logFC, decreasing = TRUE),]
top2 <- head(Tfh2genes, n=20)


# Extract gene columns and merge
merged_genes <- unique(c(topCM$gene, top1$gene, top2$gene))

# Ensure only the top 50 unique genes
merged_genes <- unique(merged_genes[1:50])


dfsub_16<- Average.dfsub[, colnames(Average.dfsub) %in% merged_genes]

dfsub_16 <- as.matrix(dfsub_16)
dfsub_16 = scale(dfsub_16)


col = list(Cluster = c("Tfh1" = "#b2182b", "Tfh1CM" = "#fbb4ae", "Tfh2" = "#1f78b4"), 
           Donor = c("donor0" = "#cb7a88", "donor1" = "#ccab7b", "donor2" = "#a3cb7a", "donor3" = "#7cbecc"),
           Day = c("day0" = "#bababa", "day16" = "#fdae61"))

ha <- HeatmapAnnotation(Cluster=Average.dfsub$seurat_clusters,
                        Day=Average.dfsub$day,
                        Donor=Average.dfsub$donor,
                        col = col)

Supplementary Figure 6 D | Expression of top 50 unique upregulated at Day 16 for Tfh1_Cyto, Tfh1_CCR7 and Tfh2 clusters relative to Day 0.

Heatmap(t(dfsub_16),
        name = "RNAseq", col=mako(50),
        top_annotation = ha,
        cluster_rows = TRUE, cluster_columns = FALSE,
        column_title = "Genes", row_title = "Samples",
        row_names_gp = gpar(fontsize=6),
        column_names_gp = gpar(fontsize=4))

xfun::session_info(c("plyr", "tidyr", "dplyr", "Seurat", "patchwork", "ggplot2", "sctransform", 
  "BiocManager", "limma", "scCustomize", "qs", "viridis", "tidyverse", 
  "ggpubr", "scRepertoire", "data.table", "gtools", "ggraph", "tibble", 
  "circlize", "ggalluvial", "RColorBrewer", "rstackdeque", "ComplexHeatmap", 
  "harmony", "openxlsx", "HGNChelper", "pheatmap", "forcats", "ggthemes", 
  "ggrepel", "cowplot", "ggfortify", "gplots", "reshape2", "rstatix", "eulerr"), dependencies = FALSE)
## Registered S3 method overwritten by 'eulerr':
##   method    from  
##   plot.venn gplots
## R version 4.4.1 (2024-06-14)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 22.04.4 LTS
## 
## Locale:
##   LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##   LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##   LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##   LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##   LC_ADDRESS=C               LC_TELEPHONE=C            
##   LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## Package version:
##   BiocManager_1.30.25   circlize_0.4.16       ComplexHeatmap_2.20.0
##   cowplot_1.1.3         data.table_1.16.0     dplyr_1.1.4          
##   eulerr_7.0.2          forcats_1.0.0         ggalluvial_0.12.5    
##   ggfortify_0.4.17      ggplot2_3.5.1         ggpubr_0.6.0         
##   ggraph_2.2.1          ggrepel_0.9.6         ggthemes_5.1.0       
##   gplots_3.2.0          gtools_3.9.5          harmony_1.2.3        
##   HGNChelper_0.8.15     limma_3.60.6          openxlsx_4.2.8       
##   patchwork_1.3.0       pheatmap_1.0.12       plyr_1.8.9           
##   qs_0.27.3             RColorBrewer_1.1-3    reshape2_1.4.4       
##   rstackdeque_1.1.1     rstatix_0.7.2         scCustomize_3.0.1    
##   scRepertoire_2.0.7    sctransform_0.4.1     Seurat_5.2.1         
##   tibble_3.2.1          tidyr_1.3.1           tidyverse_2.0.0      
##   viridis_0.6.5